# Midterm project (OLIST) — data exploration
pacman::p_load(dplyr, ggplot2, readr, plotly, googleVis, ggthemes, grid,
               tidytext, wordcloud2)
load("data/olist.rdata")
load("data/Z.rdata")

# Attach translated category names to products, then item and review info.
# (P, TPC, I, R and `segment` come from the .rdata files loaded above.)
A <- left_join(P, TPC, by = "product_category_name")
oisim <- I[, c(1, 3)]
pando <- merge(A, oisim, by = "product_id", all.x = TRUE)
pandr <- merge(R, pando, by = "order_id", all.x = TRUE, all.y = TRUE)

# Scatter: total revenue vs. average score, colored by business segment,
# point size = average items sold.
g <- ggplot(segment, aes(x = totalRev, y = avgScore, col = business_segment)) +
  geom_point(aes(size = avgItemsSold))
ggplotly(g)
# 1. X axis = total revenue, Y axis = average score, color = business segment
# 2. Point size = average items sold
# 3. The Watches segment is a clear outlier.
# NOTE(review): in the original paste, `segment1 <- ...` was on the same
# line as the "#3." comment, so it was commented out entirely.
segment1 <- segment[segment$totalRev < 100000, ]  # exclude watches (outlier)
g <- ggplot(segment1, aes(x = totalRev, y = avgScore, col = business_segment)) +
  geom_point(aes(size = avgItemsSold)) +
  geom_smooth(aes(x = totalRev, y = avgScore, col = business_segment),
              method = lm)
ggplotly(g)
🗿 商業分析:
■ 排除掉watch(outlier)後,可以看到total Revenue和average score成正向關係。
■ 可以看出評分以及總營收的分佈,並且可藉由平均銷售(泡泡大小)能了解市場大小。
■ 可以發現在 health_beauty營收、評分最高,市場大小也不錯,games_consoles營收、評分最低,市場也小 。
💡 建議:
新進賣家可以選擇從銷售量較高的產品開始販售,獲得的評分會較高。
# Per-product review stats (review count, mean score) joined with the
# product photo count (P column 5: product_photos_qty).
df01 = left_join(unique(I[, c(1, 3)]),
                 unique(left_join(O[, 1, drop = FALSE], R[, 1:3])[, -2])) %>%
  group_by(product_id) %>%
  summarise(
    noReview = n(),
    avgScore = mean(review_score)) %>%
  left_join(P[, c(1, 5)])
# Keep only rows with no missing values.
df01 = df01[complete.cases(df01), ]
# Correlation between average review score and number of product photos
# (console output showed r = 0.0217, p < 0.001 — weak but significant).
cor(df01$avgScore, df01$product_photos_qty)
cor.test(df01$avgScore, df01$product_photos_qty)
Pearson's product-moment correlation
data: df01$avgScore and df01$product_photos_qty
t = 3.9, df = 32300, p-value = 0.000096
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.010787 0.032574
sample estimates:
cor
0.021683
# 1. Group by product_id and compute the mean review score.
# 2. Test the correlation between review score and photo count.
# 3. Plot photo count vs. average score (smoothed trend + overall mean line).
ggplot(df01, aes(x = product_photos_qty, y = avgScore, col = product_photos_qty)) +
  scale_color_gradient(low = "#0DBF8C", high = "#CCCC4D") +
  stat_smooth(se = FALSE, col = "#FFA500", size = 1) +
  geom_hline(aes(yintercept = mean(avgScore)), col = "#66CDAA",
             linetype = "dashed", size = 1) +
  scale_x_continuous(breaks = c(1:20)) + theme_light() +
  ggtitle("product_photos_qty V.S Review Score")
🗿 商業分析:
■ 對電子商務網站而言,商品展示是基本且十分重要的功能,且普遍認為「高品質的商品圖片」以及「適當的產品資訊量」,有助於提升消費者對於商品的好感度,本次分析商品圖片數量是否影響評論分數。
💡 建議:
製作或拍攝產品圖片時,需注意圖片所呈現的資訊是否清楚。
過多或過少的商品圖片數量,可能會造成商品資訊易讀性降低或是商品資訊不清。
一般來說,3至9張商品圖片是最合適的圖片數量。
# Per-product average review score joined with the product name length
# (P column 3: product_name_lenght — sic, dataset spelling).
df00 = left_join(
  unique(I[, c(1, 3)]),
  unique(left_join(O[, 1, drop = FALSE], R[, 1:3])[, -2])
)
df00 = group_by(df00, product_id) %>%
  summarise(
    avgScore = mean(review_score)
  ) %>%
  left_join(P[, c(1, 3)])
# Keep only complete rows.
df00 = df00[complete.cases(df00), ]
# Scatter + linear fit: product name length vs. average score.
# (Fixed: the original used df00$... inside aes(), which bypasses the
# data argument and breaks faceting — use bare column names.)
ggplot(df00, aes(x = product_name_lenght, y = avgScore, col = product_name_lenght)) +
  geom_point(shape = 1, size = 1, alpha = I(1/5)) +
  scale_color_gradient(low = "#0DBF8C", high = "#CCCC4D") +
  stat_smooth(method = lm, se = FALSE, col = "#FFA500", size = 1) +
  xlab("Product Name Lenght") + ylab("Review Score") +
  scale_x_continuous(limits = c(5, 76)) +
  theme_light() + ggtitle("Product Name Lenght V.S Review Score")
# Distribution of name lengths (most products fall in the 40-60 range).
table(df00$product_name_lenght)
5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
2 1 2 2 8 5 7 13 16 11 28 35 37 51 72 79
21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
73 108 127 134 171 171 213 246 276 321 368 350 439 434 440 522
37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
507 541 607 644 681 709 755 711 800 881 890 900 987 1039 1018 1259
53 54 55 56 57 58 59 60 61 62 63 64 66 67 68 69
1330 1439 1683 1675 1719 1887 2025 2182 65 65 515 59 1 1 1 1
72 76
1 1
cor.test(df00$avgScore, df00$product_name_lenght)
Pearson's product-moment correlation
data: df00$avgScore and df00$product_name_lenght
t = -0.633, df = 32300, p-value = 0.53
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.0144199 0.0073772
sample estimates:
cor
-0.0035217
🗿 商業分析:
■ 畫圖,圖看起來多集中在右上角,但原因是因為產品名長度40-60的產品數量較多,且回歸線看起來幾乎是平線,兩者看起來並無關係。
■ 接著用統計方法做測試,結果的確為不顯著。
💡 建議:
產品名稱長度與產品的評分高低關係並不高,因此建議新進賣家在命名時不必太在意名稱長短,只要記得將產品重點寫妥即可。
# Rebuild df01: per-product average review score joined with the product
# description length (P column 4: product_description_lenght).
df01 = left_join(
  unique(I[, c(1, 3)]),
  unique(left_join(O[, 1, drop = FALSE], R[, 1:3])[, -2])
)
df01 = group_by(df01, product_id) %>%
  summarise(
    avgScore = mean(review_score)
  ) %>%
  left_join(P[, c(1, 4)])
# Keep only complete rows.
df01 = df01[complete.cases(df01), ]
# Scatter + linear fit: description length vs. average score.
# (Fixed: bare column names instead of df01$... inside aes().)
ggplot(df01, aes(x = product_description_lenght, y = avgScore,
                 col = product_description_lenght)) +
  geom_point(shape = 1, size = 1, alpha = I(1/5)) +
  scale_color_gradient(low = "#0DBF8C", high = "#CCCC4D") +
  stat_smooth(method = lm, se = FALSE, col = "#FFA500", size = 1) +
  xlab("product_description_lenght") + ylab("Review Score") +
  scale_x_continuous(limits = c(0, 4000)) +
  theme_light() + ggtitle("product_description_lenght V.S Review Score")
# Most common description lengths (bulk of products are below ~1000 chars).
head(sort(table(df01$product_description_lenght), decreasing = TRUE), 1000)
404 729 651 703 184 236 303 352 375 246 314 339 486 298 595 216
94 86 66 66 65 65 63 62 60 58 57 55 55 54 54 53
289 382 297 224 254 444 296 381 734 818 330 695 235 238 385 396
53 53 52 51 51 51 50 50 50 50 49 49 48 48 48 48
192 273 354 365 217 222 264 267 275 280 341 361 409 247 293 364
47 47 47 47 46 46 46 46 46 46 46 46 46 45 45 45
702 223 255 348 410 418 919 198 220 233 240 259 278 284 360 372
45 44 44 44 44 44 44 43 43 43 43 43 43 43 43 43
394 423 458 487 215 249 269 362 414 416 471 186 218 302 315 437
43 43 43 43 42 42 42 42 42 42 42 41 41 41 41 41
438 473 569 624 696 207 231 300 305 309 313 324 395 411 509 534
41 41 41 41 41 40 40 40 40 40 40 40 40 40 40 40
540 679 209 212 225 234 244 262 317 331 373 413 419 422 451 499
40 40 39 39 39 39 39 39 39 39 39 39 39 39 39 39
500 589 599 646 123 179 205 208 219 265 322 327 389 466 555 561
39 39 39 39 38 38 38 38 38 38 38 38 38 38 38 38
580 189 252 270 271 282 326 338 383 401 463 493 510 558 567 582
38 37 37 37 37 37 37 37 37 37 37 37 37 37 37 37
586 638 718 731 777 1440 160 162 175 226 232 250 288 325 329 347
37 37 37 37 37 37 36 36 36 36 36 36 36 36 36 36
374 408 478 484 609 676 688 138 228 320 332 349 391 402 427 435
36 36 36 36 36 36 36 35 35 35 35 35 35 35 35 35
455 459 485 492 527 530 537 546 623 626 655 663 701 182 188 202
35 35 35 35 35 35 35 35 35 35 35 35 35 34 34 34
210 214 248 258 261 274 283 344 350 386 443 446 462 464 498 525
34 34 34 34 34 34 34 34 34 34 34 34 34 34 34 34
549 556 614 627 648 678 776 144 171 183 211 257 272 276 279 285
34 34 34 34 34 34 34 33 33 33 33 33 33 33 33 33
292 343 346 370 388 430 431 472 476 489 513 575 635 687 728 111
33 33 33 33 33 33 33 33 33 33 33 33 33 33 33 32
124 130 190 194 199 221 245 290 291 308 310 328 340 424 452 453
32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32
465 497 502 517 519 551 557 566 568 594 617 629 640 664 677 680
32 32 32 32 32 32 32 32 32 32 32 32 32 32 32 32
867 157 204 227 287 321 355 357 379 406 407 412 421 449 518 538
32 31 31 31 31 31 31 31 31 31 31 31 31 31 31 31
554 574 598 602 622 660 704 721 752 156 167 180 196 230 241 242
31 31 31 31 31 31 31 31 31 30 30 30 30 30 30 30
251 286 306 336 377 380 392 398 400 417 425 428 442 470 501 503
30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30
506 507 548 563 572 573 587 600 604 616 666 684 700 716 918 1156
30 30 30 30 30 30 30 30 30 30 30 30 30 30 30 30
147 268 301 304 312 318 335 342 345 353 359 390 393 403 405 440
29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29
508 512 515 516 528 576 581 603 608 610 636 641 683 770 789 803
29 29 29 29 29 29 29 29 29 29 29 29 29 29 29 29
886 126 140 153 169 187 243 253 334 351 371 456 461 474 477 523
29 28 28 28 28 28 28 28 28 28 28 28 28 28 28 28
532 553 559 565 578 591 593 597 601 605 606 665 675 740 750 121
28 28 28 28 28 28 28 28 28 28 28 28 28 28 28 27
150 200 294 295 316 369 432 434 439 450 457 488 529 531 545 547
27 27 27 27 27 27 27 27 27 27 27 27 27 27 27 27
562 571 632 661 670 673 706 709 864 94 125 197 237 260 281 311
27 27 27 27 27 27 27 27 27 26 26 26 26 26 26 26
363 366 415 445 447 475 479 490 495 520 524 535 585 596 607 631
26 26 26 26 26 26 26 26 26 26 26 26 26 26 26 26
647 669 672 686 723 751 805 815 840 868 1101 139 148 166 174 181
26 26 26 26 26 26 26 26 26 26 26 25 25 25 25 25
201 206 229 256 323 337 356 358 367 397 468 480 494 504 505 521
25 25 25 25 25 25 25 25 25 25 25 25 25 25 25 25
542 543 560 564 570 612 613 615 642 650 786 844 905 114 117 159
25 25 25 25 25 25 25 25 25 25 25 25 25 24 24 24
203 213 263 441 460 469 482 511 544 630 649 657 662 707 708 712
24 24 24 24 24 24 24 24 24 24 24 24 24 24 24 24
713 726 747 791 802 816 852 870 940 133 145 146 149 155 161 163
24 24 24 24 24 24 24 24 24 23 23 23 23 23 23 23
165 177 178 193 266 277 319 376 399 420 433 491 588 634 637 668
23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23
671 681 691 693 719 722 727 749 757 783 801 817 831 882 890 893
23 23 23 23 23 23 23 23 23 23 23 23 23 23 23 23
897 909 920 956 1750 115 128 132 137 143 158 168 170 299 378 384
23 23 23 23 23 22 22 22 22 22 22 22 22 22 22 22
429 483 526 539 577 644 652 653 654 715 738 744 766 796 799 807
22 22 22 22 22 22 22 22 22 22 22 22 22 22 22 22
811 821 829 863 891 84 87 92 113 127 141 152 164 172 185 307
22 22 22 22 22 21 21 21 21 21 21 21 21 21 21 21
333 426 436 448 454 552 584 590 618 621 659 690 705 733 736 762
21 21 21 21 21 21 21 21 21 21 21 21 21 21 21 21
763 808 813 833 851 862 872 875 880 925 998 2010 116 119 151 191
21 21 21 21 21 21 21 21 21 21 21 21 20 20 20 20
387 496 536 541 592 674 685 692 697 699 710 711 739 754 790 837
20 20 20 20 20 20 20 20 20 20 20 20 20 20 20 20
856 865 866 871 874 896 906 921 955 1223 1359 1749 120 481 514 633
20 20 20 20 20 20 20 20 20 20 20 20 19 19 19 19
645 724 725 732 735 737 743 748 767 781 784 792 812 830 853 855
19 19 19 19 19 19 19 19 19 19 19 19 19 19 19 19
869 899 903 927 951 989 1006 1235 1311 154 522 579 583 619 620 628
19 19 19 19 19 19 19 19 19 18 18 18 18 18 18 18
698 730 764 779 787 826 828 832 850 915 916 922 923 929 937 971
18 18 18 18 18 18 18 18 18 18 18 18 18 18 18 18
993 1002 1009 1057 1058 1076 1405 89 100 108 112 122 135 176 195 533
18 18 18 18 18 18 18 17 17 17 17 17 17 17 17 17
643 682 720 742 756 761 769 785 798 800 806 814 824 825 827 877
17 17 17 17 17 17 17 17 17 17 17 17 17 17 17 17
878 892 902 904 913 941 978 982 1047 1146 1151 1177 95 106 131 134
17 17 17 17 17 17 17 17 17 17 17 17 16 16 16 16
142 467 639 658 689 694 746 772 795 797 804 809 823 835 838 841
16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16
849 854 883 900 910 933 934 963 964 1023 1043 1064 1069 1078 1099 1209
16 16 16 16 16 16 16 16 16 16 16 16 16 16 16 16
1212 1236 1290 1392 81 90 101 102 239 667 717 745 758 765 771 780
16 16 16 16 15 15 15 15 15 15 15 15 15 15 15 15
793 794 858 859 873 879 885 908 914 943 946 953 960 991 1003 1031
15 15 15 15 15 15 15 15 15 15 15 15 15 15 15 15
1054 1080 1132 1145 1186 1271 1313 1322 1343 1400 1649 85 104 118 129 136
15 15 15 15 15 15 15 15 15 15 15 14 14 14 14 14
173 368 550 611 656 741 755 759 760 768 782 810 819 820 834 836
14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14
847 901 917 926 928 931 935 950 952 954 961 966 979 983 1026 1066
14 14 14 14 14 14 14 14 14 14 14 14 14 14 14 14
1081 1096 1102 1130 1140 1143 1162 1168 1200 1360 1451 1541 2043 74 753 775
14 14 14 14 14 14 14 14 14 14 14 14 14 13 13 13
845 846 857 884 888 895 907 912 962 967 969 974 975 985 1000 1001
13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 13
1005 1007 1016 1018 1042 1053 1059 1087 1107 1112 1214 1219 1416 1988 2083 82
13 13 13 13 13 13 13 13 13 13 13 13 13 13 13 12
93 96 99 105 625 773 774 822 842 860 876 894 898 924 944 945
12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12
948 959 973 999 1048 1050 1074 1085 1095 1104 1105 1113 1123 1125 1135 1157
12 12 12 12 12 12 12 12 12 12 12 12 12 12 12 12
1170 1182 1193 1198 1202 1220 1241 1246 1283 1892 1953 1969 1987 58 68 71
12 12 12 12 12 12 12 12 12 12 12 12 12 11 11 11
73 110 778 839 848 861 939 965 968 981 992 996 1024 1028 1033 1034
11 11 11 11 11 11 11 11 11 11 11 11 11 11 11 11
1038 1045 1049 1056 1071 1072 1077 1084
11 11 11 11 11 11 11 11
cor.test(df01$avgScore, df01$product_description_lenght)
Pearson's product-moment correlation
data: df01$avgScore and df01$product_description_lenght
t = 4.89, df = 32300, p-value = 0.000001
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.016307 0.038088
sample estimates:
cor
0.027201
🗿 商業分析:
■ 畫圖,圖看起來多集中在左上角,但原因是因為產品描述長度1000以下的產品數量較多,但回歸線看起來有點正相關的趨勢。
■ 因此用統計方法做測試,結果為顯著。
💡 建議:
產品描述長度與評分高低是呈現正相關,因此建議新進賣家可盡量將產品描述豐富詳實,以提高自身產品評價。
# Per-order mean review score, joined onto the orders table; then compute
# the delivery duration (approval -> customer delivery, in days) and test
# its correlation with the score.
df = group_by(R, order_id) %>% summarise(score = mean(review_score))
df = merge(O, df, by = 'order_id', all.x = TRUE)
df <- na.omit(df)
df = mutate(df, duration = as.numeric(difftime(
  order_delivered_customer_date, order_approved_at, units = "days")))
# Output below: r = -0.335, p < 2e-16 — longer delivery, lower score.
cor.test(df$score, df$duration)
Pearson's product-moment correlation
data: df$score and df$duration
t = -110, df = 96500, p-value <0.0000000000000002
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.34065 -0.32944
sample estimates:
cor
-0.33506
# Order status counts (output below: 6 canceled, 96455 delivered).
table(df$order_status)
# Pull out the canceled orders for inspection.
df_canceled <- df %>% filter(order_status == "canceled")
# 1. Correlation test between delivery duration and review score
# 2. Review scores averaged per order_id
# 3. Orders joined with df by order_id
# 4. Time interval named `duration`, added to df
# 5. Pearson test: significant negative correlation
# 6. 6 canceled orders nevertheless have delivery dates — possibly
#    delivered but not picked up (check whether customer_id repeats)
# 7. NOTE(review): the original comment says "drop the 6 canceled
#    orders", but no visible code removes them from df — confirm.
summary(df)
Length:96461 Length:96461 Length:96461
Class :character Class :character Class :character
Mode :character Mode :character Mode :character
order_purchase_timestamp order_approved_at
Min. :2016-09-15 12:16:38 Min. :2016-09-15 12:16:38
1st Qu.:2017-09-14 09:28:28 1st Qu.:2017-09-14 14:30:14
Median :2018-01-20 19:59:42 Median :2018-01-22 13:49:00
Mean :2018-01-01 23:53:26 Mean :2018-01-02 10:10:06
3rd Qu.:2018-05-05 18:33:24 3rd Qu.:2018-05-06 10:30:49
Max. :2018-08-29 15:00:37 Max. :2018-08-29 15:10:26
order_delivered_carrier_date order_delivered_customer_date
Min. :2016-10-08 10:34:01 Min. :2016-10-11 13:46:32
1st Qu.:2017-09-18 16:52:19 1st Qu.:2017-09-25 22:31:59
Median :2018-01-24 16:19:03 Median :2018-02-02 19:50:56
Mean :2018-01-05 05:21:04 Mean :2018-01-14 13:17:13
3rd Qu.:2018-05-08 14:33:00 3rd Qu.:2018-05-15 23:08:54
Max. :2018-09-11 19:48:28 Max. :2018-10-17 13:22:46
order_estimated_delivery_date score duration
Min. :2016-10-04 00:00:00 Min. :1.00 Min. : -6.99
1st Qu.:2017-10-05 00:00:00 1st Qu.:4.00 1st Qu.: 6.31
Median :2018-02-16 00:00:00 Median :5.00 Median : 9.85
Mean :2018-01-25 17:33:14 Mean :4.14 Mean : 12.13
3rd Qu.:2018-05-28 00:00:00 3rd Qu.:5.00 3rd Qu.: 15.14
Max. :2018-10-25 00:00:00 Max. :5.00 Max. :208.50
# Keep only positive durations (summary above showed a minimum of -6.99,
# which can only be a data error).
df = filter(df, df$duration > 0)
summary(df)
Length:96400 Length:96400 Length:96400
Class :character Class :character Class :character
Mode :character Mode :character Mode :character
order_purchase_timestamp order_approved_at
Min. :2016-09-15 12:16:38 Min. :2016-09-15 12:16:38
1st Qu.:2017-09-14 09:06:45 1st Qu.:2017-09-14 13:30:21
Median :2018-01-20 18:22:49 Median :2018-01-22 13:45:02
Mean :2018-01-01 22:10:15 Mean :2018-01-02 08:22:00
3rd Qu.:2018-05-05 17:26:10 3rd Qu.:2018-05-06 00:58:29
Max. :2018-08-29 15:00:37 Max. :2018-08-29 15:10:26
order_delivered_carrier_date order_delivered_customer_date
Min. :2016-10-08 10:34:01 Min. :2016-10-11 13:46:32
1st Qu.:2017-09-18 16:46:13 1st Qu.:2017-09-25 22:18:27
Median :2018-01-24 15:26:45 Median :2018-02-02 18:55:39
Mean :2018-01-05 03:38:43 Mean :2018-01-14 11:41:39
3rd Qu.:2018-05-08 14:21:00 3rd Qu.:2018-05-15 22:03:05
Max. :2018-09-11 19:48:28 Max. :2018-10-17 13:22:46
order_estimated_delivery_date score duration
Min. :2016-10-04 00:00:00 Min. :1.00 Min. : 0.008
1st Qu.:2017-10-05 00:00:00 1st Qu.:4.00 1st Qu.: 6.315
Median :2018-02-16 00:00:00 Median :5.00 Median : 9.857
Mean :2018-01-25 15:58:40 Mean :4.14 Mean : 12.139
3rd Qu.:2018-05-28 00:00:00 3rd Qu.:5.00 3rd Qu.: 15.145
Max. :2018-10-25 00:00:00 Max. :5.00 Max. :208.501
# Spread of delivery duration (output: sd ≈ 9.51 days).
sd(df$duration)
# EnvStats provides rosnerTest(); note it masks stats::predict and
# base::print.default when attached.
library(EnvStats)
# Rosner's test for up to k = 3 outliers in the duration distribution
# (output below flags the three largest durations, ~195-208 days).
rosnerTest(df$duration)
Results of Outlier Test
-------------------------
Test Method: Rosner's Test for Outliers
Hypothesized Distribution: Normal
Data: df$duration
Sample Size: 96400
Test Statistics: R.1 = 20.642
R.2 = 20.671
R.3 = 19.293
Test Statistic Parameter: k = 3
Alternative Hypothesis: Up to 3 observations are not
from the same Distribution.
Type I Error: 5%
Number of Outliers Detected: 3
i Mean.i SD.i Value Obs.Num R.i+1 lambda.i+1 Outlier
1 0 12.139 9.5126 208.50 76095 20.642 5.019 TRUE
2 1 12.137 9.4916 208.34 10277 20.671 5.019 TRUE
3 2 12.135 9.4706 194.85 17876 19.293 5.019 TRUE
# Tukey-style upper fence: Q3 + 1.5 * IQR.
# NOTE(review): 15 is a hard-coded approximation of Q3 (~15.145 in the
# summary above); quantile(df$duration, 0.75) would be more robust.
bench <- 15 + 1.5 * IQR(df$duration)
bench
# Drop orders whose delivery duration exceeds the fence.
# (Fixed: the original filtered on a hard-coded 28.5 although bench
# evaluates to ~28.245, and checked a hard-coded row index 629.)
df_outlier = df %>% filter(duration > bench)
df = anti_join(df, df_outlier, by = "order_id")
max(df$duration)  # confirm the remaining maximum is below the fence
# 1. Negative durations were found; only duration > 0 rows were kept.
# summary(df)  # to check negative values are filtered
# sd(duration) was ~9.51 before outlier removal
# Outliers beyond the Tukey fence were removed above.
# Summary of the cleaned data (mean score rises to 4.24).
summary(df)
Length:91387 Length:91387 Length:91387
Class :character Class :character Class :character
Mode :character Mode :character Mode :character
order_purchase_timestamp order_approved_at
Min. :2016-10-03 09:44:50 Min. :2016-10-04 09:43:32
1st Qu.:2017-09-11 20:18:58 1st Qu.:2017-09-12 04:50:39
Median :2018-01-20 09:38:35 Median :2018-01-20 15:41:24
Mean :2018-01-02 10:36:16 Mean :2018-01-02 20:46:55
3rd Qu.:2018-05-08 21:50:49 3rd Qu.:2018-05-09 09:43:32
Max. :2018-08-29 15:00:37 Max. :2018-08-29 15:10:26
order_delivered_carrier_date order_delivered_customer_date
Min. :2016-10-08 10:34:01 Min. :2016-10-11 13:46:32
1st Qu.:2017-09-14 13:58:53 1st Qu.:2017-09-21 17:36:09
Median :2018-01-23 21:52:51 Median :2018-01-31 22:28:39
Mean :2018-01-05 10:51:35 Mean :2018-01-13 11:23:04
3rd Qu.:2018-05-10 16:27:00 3rd Qu.:2018-05-17 20:18:50
Max. :2018-08-31 15:25:00 Max. :2018-09-11 00:37:34
order_estimated_delivery_date score duration
Min. :2016-10-27 00:00:00 Min. :1.00 Min. : 0.0077
1st Qu.:2017-10-03 00:00:00 1st Qu.:4.00 1st Qu.: 6.1535
Median :2018-02-15 00:00:00 Median :5.00 Median : 9.3555
Mean :2018-01-25 20:15:07 Mean :4.24 Mean :10.6084
3rd Qu.:2018-05-30 00:00:00 3rd Qu.:5.00 3rd Qu.:14.0860
Max. :2018-10-25 00:00:00 Max. :5.00 Max. :28.4995
# After cleaning: mean duration ≈ 10.6 days, median ≈ 9.36.
sd(df$duration)  # ≈ 6.01 days

# Mean score for orders delivered about one SD above the mean (>= 15 days).
df_1sd = df %>% filter(df$duration >= 15)
mean(df_1sd$score)  # ≈ 3.92 — noticeably below the overall 4.24

# Mode of the (integer-truncated) delivery duration.
df_00 = cbind(df$order_id, df$duration) %>% data.frame()
df_00$X2 = as.integer(as.character(df_00$X2))
# NOTE(review): this truncates df$duration in place via a character
# round-trip; as.integer(df$duration) would do the same more directly.
df$duration = as.integer(as.character(df$duration))
names(table(df_00$X2))[table(df_00$X2) == max(table(df_00$X2))]  # "7"

# Mean score at the modal duration (7 days).
library(dplyr)  # redundant — already attached via pacman::p_load above
df_modescore = filter(df, df$duration >= 7 & df$duration < 8)
mean(df_modescore$score)  # 4.3542 — above the 4.24 average

# Round scores to integers and box-plot duration by rounded score.
score2 <- round(df$score, digits = 0)
df <- cbind(df, score2)
df = df[, -12]  # NOTE(review): positional column drop is fragile — verify index 12
boxplot(formula = duration ~ score2,
        data = df, xlab = "score2", ylab = "duration", col = "lightblue")
🗿 商業分析:
■ 1.由相關檢定發現,(不論篩過偏離值與否)貨運時間間隔與滿意度評分有顯著相關,意即顧客等待時間越久,整體的滿意度會越低。
■ 2.(mode = 7)大部分時間間隔在7日左右,平均評分落在4.4分,高於平均4.24,代表多數時候,olist的訂單及貨運處理流程是令人滿意的。
■ 3.olist的出貨品質(到貨天數)是不穩定的:標準差為6日左右,接近平均送達天數的一倍,而到貨天數不穩定會直接影響評論分數。
💡 建議:
標準差為6日左右,代表olist貨運間隔時間並不穩定,然此將直接攸關滿意度評分,因此建議olist從訂單處理、備貨、出送貨流程優化下手,以獲得更好的顧客回饋。